# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #  
#
#' @title  Create time-series indicators of parties' anti-elite strategies
#' @author Hauke Licht
#' 
#' @note: Party tweet-level estimates of tweets' labels and label class 
#'         probabilities come from fine-tuned XLM-T classifier
#
# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #

# setup -----

library(readr)
library(dplyr)
library(lubridate)
library(tidyr)
library(purrr)
library(ggplot2)

# project-relative directories; every sub-folder is spelled out from the root
base_path <- file.path(".")
data_path <- file.path(base_path, "data")
exdata_path <- file.path(base_path, "data", "exdata")
input_path <- file.path(base_path, "data", "input")
output_path <- file.path(base_path, "data", "output")

# load predictions for old and new tweets ----

# a) political ----

# tweet-level "political" predictions (columns used below include
# `prob_political` and `political`)
tweets_political <- file.path(input_path, "all_tweets_classified_political.rds") %>% 
  read_rds()

# b) elite criticism ----

# Read the elite-criticism predictions and derive a binary label from the
# predicted probability.
# `col_types = "ccd"` reads the first two columns as character and the third
# as double; the two character columns are later used as the join keys
# `user_id` and `status_id`.
fp <- file.path(output_path, "all_tweets_classified_elitecriticism.csv")

tweets_elitecriticism <- read_csv(fp, col_types = "ccd") %>% 
  mutate(
    # "yes" if predicted probability exceeds 0.5, "no" otherwise (NA stays NA);
    # use TRUE/FALSE instead of the reassignable shorthands T/F
    elitecriticism = factor(pred_prob > .5, c(TRUE, FALSE), c("yes", "no"))
  ) %>% 
  rename(prob_elitecriticism = pred_prob)

# load labeled party tweets data ----

parl_party_tweets <- file.path(input_path, "parl_party_tweets.rds") %>% 
  read_rds()

# one row per distinct country/party/account combination, sorted for lookup
parties <- arrange(
  distinct(parl_party_tweets, country_iso3c, party_name_short, party_id, screen_name)
  , country_iso3c, party_name_short, screen_name
)

# calendar years in which tweets were created (sorted; used as factor levels below)
(years <- parl_party_tweets$created_at %>% year() %>% unique() %>% sort())

# ISO 8601 week-numbering years present in the data. Around New Year these can
# differ from the calendar year (e.g., 2019-12-30 belongs to ISO week 2020-W01),
# so week labels must be built from `isoyear()`, not `year()`; otherwise the
# last days of December are pooled with the first week of the same calendar year.
iso_years <- sort(unique(c(years, isoyear(parl_party_tweets$created_at))))

tweet_timeseries <- parl_party_tweets %>% 
  mutate(
    year = year(created_at)
    # time-unit IDs are "YYYY-NN" labels stored as factors whose levels span
    # all units of all observed years (so levels sort chronologically)
    , halfyear = factor(
      sprintf("%04d-%02d", year, (quarter(created_at) > 2) + 1)
      , paste0(rep(years, each = 2), "-0", 1:2)
    )
    , quarter = factor(
      sprintf("%04d-%02d", year, quarter(created_at))
      , paste0(rep(years, each = 4), "-0", 1:4)
    )
    , month = factor(
      sprintf("%04d-%02d", year, month(created_at))
      , paste0(rep(years, each = 12), "-", sprintf("%02d", 1:12))
    )
    , week = factor(
      # ISO week within ISO week-numbering year (up to 53 weeks per year)
      sprintf("%04d-%02d", isoyear(created_at), isoweek(created_at))
      , paste0(rep(iso_years, each = 53), "-", sprintf("%02d", 1:53))
    )
    , year = factor(year, as.character(years))
  ) %>%
  # add labeled data
  left_join(
    select(tweets_political, country_iso3c, party_id, party_name_short, user_id, status_id, prob_political, political)
    , by = c("country_iso3c", "party_id", "party_name_short", "user_id", "status_id")
    , relationship = "many-to-many"
  ) %>%
  # inner join: tweets without an elite-criticism prediction are dropped
  inner_join(
    tweets_elitecriticism
    , by = c("user_id", "status_id")
    , relationship = "many-to-many"
  ) %>%
  # order columns
  select(
    country_iso3c
    , election_date, election_date_next
    , pre_elec_period
    , elec_period_sdate, elec_period_edate
    , parl_config_sdate, parl_config_edate
    , party_id, party_name_short, user_id
    , year, halfyear, quarter, month, week
    , status_id, created_at
    , text, lang
    , prob_political, political 
    , prob_elitecriticism, elitecriticism
    , tweet_data_collected_at = data_collected_at
  )

# NOTE: an existing file is left untouched, i.e., it is NOT refreshed when the
#       labels computed above change
fp <- file.path(output_path, "parl_party_tweets_labeled.rds")
if (!file.exists(fp))
  write_rds(tweet_timeseries, fp)

# create full schedule of party-time window units ----
configs <- list()

# one row per party and parliamentary configuration with the date range
# (`start`, `end`) the configuration covers
tmp <- tweet_timeseries %>% 
  group_by(
    country_iso3c
    , party_id, party_name_short
    , pre_elec_period
    , election_date, election_date_next
    , elec_period_sdate, elec_period_edate
    , parl_config_sdate, parl_config_edate
  ) %>% 
  # date of the last tweet observed in the group; the result stays grouped by
  # all but the last grouping variable (made explicit to silence the message)
  summarise(last = as_date(max(created_at)), .groups = "drop_last") %>% 
  # NOTE(review): positional sort kept as is -- confirm which columns the
  #               positions refer to
  arrange_at(c(1, 3:6)) %>% 
  mutate(
    start = parl_config_sdate
    # open-ended configurations/election periods end with the last observed tweet
    , end = case_when(
      is.na(parl_config_edate) ~ last
      , is.na(elec_period_edate) ~ last
      , TRUE ~ parl_config_edate
    )
  ) %>% 
  select(-last)
  
# all party--weeks
configs$week <- tmp %>% 
  ungroup() %>% 
  # expand row by row (does not rely on groups holding exactly one row)
  rowwise() %>% 
  mutate(
    # start the sequence on the Monday of the ISO week containing `start` so 
    # that every ISO week overlapping [start, end] is listed (stepping from 
    # `start` itself can miss the last, partially covered week);
    # labels use the same ISO year/week scheme as `tweet_timeseries$week`
    week = seq(floor_date(start, "week", week_start = 1), end, by = "week") %>% 
      {sprintf(fmt = "%04d-%02d", isoyear(.), isoweek(.))} %>%
      list()
  ) %>% 
  ungroup() %>% 
  select(-start, -end) %>% 
  unnest_longer(c(week)) %>% 
  arrange(country_iso3c, party_name_short, party_id, week) %>%
  # if several configurations cover the same party-week, keep the first one
  group_by(country_iso3c, party_name_short, party_id, week) %>% 
  slice(1) %>% 
  ungroup()

# check: should return zero rows (party-weeks must be unique)
configs$week %>% 
  group_by(country_iso3c, party_name_short, party_id, week) %>% 
  filter(n() > 1)

# all party--months
months <- tmp %>% 
  ungroup() %>% 
  # expand row by row (does not rely on groups holding exactly one row)
  rowwise() %>% 
  mutate(
    # sequence from the FIRST day of the start month: stepping by month from a 
    # start date on day 29-31 overflows short months (e.g., Jan 31 -> Mar 2/3, 
    # skipping February), and stepping from `start` itself drops the end month 
    # whenever `end` falls on an earlier day-of-month than `start`
    month = seq(floor_date(start, "month"), end, by = "month") %>% 
      {sprintf(fmt = "%04d-%02d", year(.), month(.))} %>%
      list()
  ) %>% 
  ungroup() %>% 
  unnest_longer(c(month)) %>% 
  group_by(country_iso3c, party_name_short, party_id, month)

configs$month <- months %>% 
  # filter(party_id == 2078, month == "2012-06") %>% 
  arrange(parl_config_sdate) %>% 
  # only party-months covered by more than one configuration need resolving
  filter(n() > 1) %>% 
  # if there are two configs for one month, select the one with most number of days in month 
  mutate(
    r_ = row_number()
    # last day of the month in which a later (r_ > 1) configuration starts
    , dim_ = if_else(r_ == 1, NA_integer_, days_in_month(start))
    , tmp_ = sprintf("%s-%02d", month, dim_)
    , tmp_ = ymd(if_else(r_ == 1, NA_character_, tmp_))
    , n_days_in_month = if_else(
      r_ == 1
      # first config: days from the first of the month until its end date
      , end - ymd(paste(month, "01", sep = "-"))
      # later configs: days from their start date until the end of the month
      , (tmp_+days(1)) - start
    )
    , dim_ = NULL
    , tmp_ = NULL
  ) %>% 
  # select config with most days in month
  top_n(1, wt = n_days_in_month) %>% 
  # select first config if tie
  top_n(1, wt = -r_) %>% 
  # add back party-months covered by exactly one configuration
  bind_rows(filter(months, n() == 1)) %>% 
  ungroup() %>% 
  select(-start, -end, -r_, -n_days_in_month) %>% 
  arrange(country_iso3c, party_name_short, party_id, month)
 
# check: should return zero rows (party-months must be unique)
configs$month %>% 
  group_by(country_iso3c, party_name_short, party_id, month) %>% 
  filter(n() > 1)

# all party--quarters
configs$quarter <- tmp %>% 
  ungroup() %>% 
  # expand row by row (does not rely on groups holding exactly one row)
  rowwise() %>% 
  mutate(
    # sequence from the first day of the start quarter so that every quarter 
    # overlapping [start, end] is listed (stepping from `start` itself can miss 
    # the last quarter a configuration reaches into)
    quarter = seq(floor_date(start, "quarter"), end, by = "quarter") %>% 
      {sprintf(fmt = "%04d-%02d", year(.), quarter(.))} %>%
      list()
  ) %>% 
  ungroup() %>% 
  unnest_longer(c(quarter)) %>% 
  group_by(country_iso3c, party_name_short, party_id, quarter)

# temporarily keep for half-year computation
configs$halfyear <- configs$quarter

configs$quarter <- configs$quarter %>% 
  # only party-quarters covered by more than one configuration need resolving
  filter(n( ) > 1) %>% 
  mutate(
    r_ = row_number()
    # calendar month in which the quarter begins (quarter ID is "YYYY-0Q")
    , first_month = c(1, 4, 7, 10)[as.integer(substr(quarter, 6, 7))]
    , flag = if_else(
      # flag always one for first config
      r_ == 1
      , r_ 
      # for other configs, flag is 0 if it starts in first month of quarter
      # (and equals the row number otherwise)
      , r_ * (month(start) != first_month)
    )
  ) %>% 
  # select config with smallest flag: a later config that starts in the first
  # month of the quarter (flag 0) wins over the first config (flag 1)
  top_n(1, wt = -flag) %>% 
  # select first config if tie
  top_n(1, wt = -r_) %>% 
  # add back party-quarters covered by exactly one configuration
  bind_rows(filter(configs$quarter, n() == 1)) %>% 
  ungroup() %>% 
  select(-start, -end, -r_, -first_month, -flag) %>% 
  arrange(country_iso3c, party_name_short, party_id, quarter)

# check: should return zero rows (party-quarters must be unique)
configs$quarter %>% 
  group_by(country_iso3c, party_name_short, party_id, quarter) %>% 
  filter(n( ) > 1)

# party--half years
# (derived from the unresolved party--quarter schedule kept above)
configs$halfyear <- configs$halfyear %>% 
  ungroup() %>% 
  mutate(
    # quarters 3 and 4 map to half-year 2, quarters 1 and 2 to half-year 1
    halfyear = if_else(
      grepl("-0[34]$", quarter)
      , sub("-\\d+$" ,"-02", quarter)
      , sub("-\\d+$" ,"-01", quarter)
    )
  ) %>% 
  select(-quarter) %>% 
  # two quarters of the same config collapse into one half-year row
  unique() %>% 
  group_by(country_iso3c, party_name_short, party_id, halfyear) 

configs$halfyear <- configs$halfyear %>% 
  # resolve party-half-years that are covered by several configurations
  filter(n() > 1) %>% 
  mutate(
    cfg_rank = row_number()
    # calendar month in which the half-year begins (ID is "YYYY-0H")
    , hy_first_month = c(1, 7)[as.integer(substr(halfyear, 6, 7))]
    , pick_score = if_else(
      # the first config always scores one
      cfg_rank == 1
      , cfg_rank
      # any other config scores zero if its start month is at most one month 
      # after the first month of the half-year, and its row number otherwise
      , cfg_rank * (month(start) > hy_first_month + 1)
    )
  ) %>% 
  # lowest score wins: keep the ongoing (first) config unless a following 
  # config starts within the first two months of the half-year
  top_n(1, wt = -pick_score) %>% 
  # among remaining ties, keep the first config
  top_n(1, wt = -cfg_rank) %>% 
  # add back party-half-years covered by exactly one configuration
  bind_rows(filter(configs$halfyear, n() == 1)) %>% 
  ungroup() %>% 
  select(-c(start, end, cfg_rank, hy_first_month, pick_score)) %>% 
  arrange(country_iso3c, party_name_short, party_id, halfyear)

# check: expected to return zero rows
configs$halfyear %>% 
  group_by(country_iso3c, party_name_short, party_id, halfyear) %>% 
  filter(n() > 1)

# compute aggregate estimates for fixed-time windows ----
fixed_window_timeseries <- list()

# helper
#' Aggregate tweet-level predictions to party--time-unit level
#'
#' @param .f Name (single string) of the time-unit column to aggregate by 
#'   (e.g., "month"); must be a column of the tweet data and of `.configs`.
#' @param .configs Data frame with the complete schedule of party--time units.
#'   Must contain `country_iso3c`, `party_id`, `party_name_short`, and the 
#'   column named by `.f`.
#' @param filter.political If `TRUE` (default), only tweets with 
#'   `political == "yes"` enter the aggregates.
#' @param .tweets Tweet-level data to aggregate; defaults to the global 
#'   `tweet_timeseries`.
#'
#' @return `.configs` with the aggregates joined on (one row per scheduled 
#'   party--time unit): `user_ids`, `first_date`, `n_tweets`, 
#'   `n_elitecriticism`, `prop_elitecriticism`, and `mean_prob_elitecriticism`. 
#'   Units without tweets have `n_tweets = 0` and `NA` in the other aggregates. 
#'   Rows are sorted by country, party, and time unit.
compute_fixed_window_aggregates <- function(.f, .configs, filter.political = TRUE, .tweets = tweet_timeseries){
  # create temporary copy
  tweets <- ungroup(.tweets)
  
  stopifnot(
    is.character(.f), length(.f) == 1L
    , .f %in% names(tweets), .f %in% names(.configs)
  )
  
  # columns identifying a party-time unit (also used as join keys below)
  unit_keys <- c("country_iso3c", "party_id", "party_name_short", .f)
  
  # remove "non-political" tweets if desired
  if (filter.political) {
    tweets <- filter(tweets, political == "yes")
  }
  
  # aggregate at level of `.f`
  out <- tweets %>%
    group_by(across(all_of(unit_keys))) %>% 
    summarise(
      # collect user IDs at party level (usually N=1)
      user_ids = list(unique(user_id))
      # date of first recorded tweet in party-time unit
      , first_date = min(as_date(created_at))
      # No. tweets in party-time unit
      , n_tweets = n_distinct(status_id)
      # outcome class party-time unit aggregates
      , n_elitecriticism = sum(elitecriticism == "yes", na.rm = TRUE)
      , prop_elitecriticism = mean(elitecriticism == "yes", na.rm = TRUE)
      , mean_prob_elitecriticism = mean(prob_elitecriticism, na.rm = TRUE)
      , .groups = "drop"
    )
  
  # right join complete schedule of party-time units to output
  # (explicit keys instead of an implicit natural join)
  right_join(out, .configs, by = unit_keys) %>% 
    # scheduled units without any tweet have zero tweets
    mutate(n_tweets = coalesce(n_tweets, 0L)) %>% 
    # arrange by country, party, and time unit ID
    # (`.data[[.f]]` sorts by the column named in `.f`; injecting the string 
    #  with `!!.f` would sort by a constant, i.e., not by time unit at all)
    arrange(country_iso3c, party_name_short, .data[[.f]])
}

# window sizes = time units for which aggregates are computed
windows <- c("halfyear", "quarter", "month", "week")

# one list element per window size, named after the time unit
for (f in windows) {
  # progress message
  message("\b\r", f)
  # aggregates are based on "political" tweets only
  fixed_window_timeseries[[f]] <- compute_fixed_window_aggregates(
    .f = f
    , .configs = configs[[f]]
    , filter.political = TRUE
  )
}

# verify: each of the following lists party-time units that occur more than 
# once (expected to return zero rows)

# ... party--half years
fixed_window_timeseries[["halfyear"]] %>%
  group_by(country_iso3c, party_name_short, party_id, halfyear) %>% 
  filter(n() > 1)

# ... party--quarters
fixed_window_timeseries[["quarter"]] %>%
  group_by(country_iso3c, party_name_short, party_id, quarter) %>% 
  filter(n() > 1)

# ... party--months
fixed_window_timeseries[["month"]] %>%
  group_by(country_iso3c, party_name_short, party_id, month) %>% 
  filter(n() > 1)

# ... party--weeks
fixed_window_timeseries[["week"]] %>%
  group_by(country_iso3c, party_name_short, party_id, week) %>% 
  filter(n() > 1)

# note: missingness higher for more granular time units (because higher sparsity)
# share of party-time units without an estimate, by window size
fixed_window_timeseries %>% 
  map_dfr(
    function(d) count(d, missing = if_else(is.na(mean_prob_elitecriticism), "yes", "no"))
    , .id = "level"
  ) %>% 
  pivot_wider(names_from = "missing", values_from = "n") %>% 
  mutate(missing_ratio = yes / (yes + no))

# inspect No. obs
map_int(fixed_window_timeseries, nrow)

# inter-metric correlations (mean predicted probability vs. share of "yes" labels)
map(
  fixed_window_timeseries
  , function(d) {
    cor(d$mean_prob_elitecriticism, d$prop_elitecriticism, use = "pairwise.complete.obs")
  }
)

# write list of fixed-window time series to disk (an existing file is kept as is)
fp <- file.path(output_path, "parl_party_tweets_elitecriticism_fixed_window_timeseries.rds")
if (!file.exists(fp)) {
  write_rds(fixed_window_timeseries, fp)
}
